{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import pandas as pd\n",
    "import time\n",
    "import numpy as np\n",
    "\n",
    "from nltk.tokenize import TweetTokenizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Twitter chat corpus preprocessing\n",
    "\n",
    "dataset source: https://github.com/Marsan-Ma/chat_corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sentence_to_ids(sentence, dictionary):\n",
    "    \"\"\"\n",
    "    Transform sentence into ids and record new words\n",
    "        sentence: [word_0, word_1, ...]\n",
    "        dictionary: {word: [id, frequency]}\n",
    "    \"\"\"\n",
    "    ids = []\n",
    "    for i in range(len(sentence)):\n",
    "        # strip hashtag's #\n",
    "        if sentence[i][0] == \"#\":\n",
    "            sentence[i] = sentence[i][1:]\n",
    "\n",
    "        if sentence[i] not in dictionary:\n",
    "            dictionary[sentence[i]] = [len(dictionary), 0]\n",
    "\n",
    "        # count word frequency\n",
    "        dictionary[sentence[i]][1] += 1\n",
    "\n",
    "        ids.append(dictionary[sentence[i]][0])\n",
    "    \n",
    "    return sentence, ids\n",
    "\n",
    "\n",
    "def tokenize_and_build_dictionary(sentences, print_every=100000):\n",
    "    \"\"\"\n",
    "    NLTK Tweet: Tokenizations for training data, build a dictionary\n",
    "        sentences: [sentence_0, sentence_1, ...]\n",
    "    Returns:\n",
    "        data: list of {message:[...], m_ids:[...], response:[...], r_ids:[...]}\n",
    "        dictionary: {word: [id, frequency]}\n",
    "    \"\"\"\n",
    "    twtknzr = TweetTokenizer(reduce_len=True, strip_handles=True)\n",
    "    data = []\n",
    "    dictionary = {}\n",
    "\n",
    "    # Build dictionary\n",
    "    st = time.time()\n",
    "    for i in range(0, len(sentences), 2):\n",
    "        pair = {}\n",
    "        message = twtknzr.tokenize(sentences[i])\n",
    "        response = twtknzr.tokenize(sentences[i + 1])\n",
    "\n",
    "        # transform text into token_id sequence\n",
    "        pair[\"message\"], pair[\"m_ids\"] = sentence_to_ids(message, dictionary)\n",
    "        pair[\"response\"], pair[\"r_ids\"] = sentence_to_ids(response, dictionary)\n",
    "\n",
    "        if i % print_every == 0:\n",
    "            print(\"Processed {} sentences, used {:.4f} seconds.\".format(\n",
    "                  i, time.time() - st))\n",
    "\n",
    "        data.append(pair)\n",
    "\n",
    "    return data, dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"twitter_en_big.txt\", header=None, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed 0 sentences, used 0.0006 seconds.\n",
      "Processed 100000 sentences, used 6.8714 seconds.\n",
      "Processed 200000 sentences, used 13.2371 seconds.\n",
      "Processed 300000 sentences, used 20.8553 seconds.\n",
      "Processed 400000 sentences, used 28.4578 seconds.\n",
      "Processed 500000 sentences, used 35.2528 seconds.\n",
      "Processed 600000 sentences, used 42.3493 seconds.\n",
      "Processed 700000 sentences, used 49.5808 seconds.\n",
      "Processed 800000 sentences, used 56.4027 seconds.\n",
      "Processed 900000 sentences, used 62.9982 seconds.\n",
      "Processed 1000000 sentences, used 70.3672 seconds.\n",
      "Processed 1100000 sentences, used 77.9817 seconds.\n",
      "Processed 1200000 sentences, used 84.6329 seconds.\n",
      "Processed 1300000 sentences, used 91.7561 seconds.\n",
      "Processed 1400000 sentences, used 99.6297 seconds.\n",
      "Processed 1500000 sentences, used 106.4259 seconds.\n",
      "Processed 1600000 sentences, used 113.4650 seconds.\n",
      "Processed 1700000 sentences, used 120.1971 seconds.\n",
      "Processed 1800000 sentences, used 128.0018 seconds.\n",
      "Processed 1900000 sentences, used 134.6044 seconds.\n",
      "Processed 2000000 sentences, used 141.5260 seconds.\n",
      "Processed 2100000 sentences, used 148.4213 seconds.\n",
      "Processed 2200000 sentences, used 156.6541 seconds.\n",
      "Processed 2300000 sentences, used 163.4629 seconds.\n",
      "Processed 2400000 sentences, used 170.0118 seconds.\n",
      "Processed 2500000 sentences, used 176.5172 seconds.\n",
      "Processed 2600000 sentences, used 183.3974 seconds.\n",
      "Processed 2700000 sentences, used 190.2109 seconds.\n",
      "Processed 2800000 sentences, used 198.9971 seconds.\n",
      "Processed 2900000 sentences, used 205.7681 seconds.\n",
      "Processed 3000000 sentences, used 212.0406 seconds.\n",
      "Processed 3100000 sentences, used 219.5196 seconds.\n",
      "Processed 3200000 sentences, used 226.2469 seconds.\n",
      "Processed 3300000 sentences, used 232.9807 seconds.\n",
      "Processed 3400000 sentences, used 240.0532 seconds.\n",
      "Processed 3500000 sentences, used 248.7398 seconds.\n",
      "Processed 3600000 sentences, used 256.0775 seconds.\n",
      "Processed 3700000 sentences, used 263.1950 seconds.\n",
      "Processed 3800000 sentences, used 270.2908 seconds.\n",
      "Processed 3900000 sentences, used 277.2883 seconds.\n",
      "Processed 4000000 sentences, used 283.2971 seconds.\n",
      "Processed 4100000 sentences, used 290.5568 seconds.\n",
      "Processed 4200000 sentences, used 297.2629 seconds.\n",
      "Processed 4300000 sentences, used 303.4168 seconds.\n",
      "Processed 4400000 sentences, used 313.6999 seconds.\n",
      "Processed 4500000 sentences, used 319.8846 seconds.\n",
      "Processed 4600000 sentences, used 326.1014 seconds.\n",
      "Processed 4700000 sentences, used 333.6452 seconds.\n",
      "Processed 4800000 sentences, used 340.4382 seconds.\n",
      "Processed 4900000 sentences, used 347.0680 seconds.\n",
      "Processed 5000000 sentences, used 353.7299 seconds.\n",
      "Processed 5100000 sentences, used 360.5382 seconds.\n"
     ]
    }
   ],
   "source": [
    "data, dictionary = tokenize_and_build_dictionary(df[0].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words: 78896214; Vocabulary size: 490844\n"
     ]
    }
   ],
   "source": [
    "total_words = sum([len(d[\"m_ids\"]) + len(d[\"r_ids\"]) for d in data])\n",
    "dict_size = len(dictionary)\n",
    "\n",
    "print(\"Total words: {}; Vocabulary size: {}\".format(total_words, dict_size))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Select Vocabulary Size Threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(3427659, '.'),\n",
       " (2149626, 'the'),\n",
       " (1679820, ','),\n",
       " (1653380, 'to'),\n",
       " (1522926, 'i'),\n",
       " (1414138, 'a'),\n",
       " (1224043, '!'),\n",
       " (1091332, 'you'),\n",
       " (1016107, 'and'),\n",
       " (973584, 'is'),\n",
       " (930598, '?'),\n",
       " (927294, 'of'),\n",
       " (822498, 'in'),\n",
       " (737845, 'for'),\n",
       " (736758, 'it'),\n",
       " (698561, 'that'),\n",
       " (591339, 'this'),\n",
       " (537695, 'on'),\n",
       " (511206, '\"'),\n",
       " (481721, 'my'),\n",
       " (417829, 'be'),\n",
       " (415003, 'not'),\n",
       " (413289, 'but'),\n",
       " (392080, 'are'),\n",
       " (391168, 'have'),\n",
       " (379540, 'was'),\n",
       " (377057, 'so'),\n",
       " (368757, 'with'),\n",
       " (353307, 'me'),\n",
       " (311717, 'just'),\n",
       " (310173, '...'),\n",
       " (303278, 'like'),\n",
       " (301573, 'we'),\n",
       " (294072, 'at'),\n",
       " (288788, 'your'),\n",
       " (287566, 'what'),\n",
       " (287371, 'he'),\n",
       " (281890, 'they'),\n",
       " (276753, \"i'm\"),\n",
       " (272696, 'if'),\n",
       " (272233, 'all'),\n",
       " (266775, ':'),\n",
       " (260933, \"it's\"),\n",
       " (245940, 'about'),\n",
       " (245561, '&'),\n",
       " (239664, 'do'),\n",
       " (234406, '…'),\n",
       " (228516, 'as'),\n",
       " (222962, 'trump'),\n",
       " (218463, 'no'),\n",
       " (211724, 'out'),\n",
       " (210307, \"don't\"),\n",
       " (207928, 'up'),\n",
       " (205857, 'can'),\n",
       " (204347, 'will'),\n",
       " (202016, 'get'),\n",
       " (201805, 'one'),\n",
       " (194883, 'from'),\n",
       " (192560, 'how'),\n",
       " (191592, '-'),\n",
       " (189060, 'or'),\n",
       " (178137, 'when'),\n",
       " (178088, 'know'),\n",
       " (169052, '/'),\n",
       " (168047, 'who'),\n",
       " (166524, 'an'),\n",
       " (166233, 'now'),\n",
       " (164588, 'people'),\n",
       " (162022, 'good'),\n",
       " (154911, 'more'),\n",
       " (153561, 'his'),\n",
       " (151937, 'by'),\n",
       " (151705, 'think'),\n",
       " (150194, 'has'),\n",
       " (148791, 'there'),\n",
       " (148560, 'would'),\n",
       " (142092, ')'),\n",
       " (140522, 'time'),\n",
       " (139845, '😂'),\n",
       " (134369, 'lol'),\n",
       " (134002, '('),\n",
       " (132700, \"'\"),\n",
       " (131119, 'too'),\n",
       " (130797, 'u'),\n",
       " (128651, 'why'),\n",
       " (128384, 'she'),\n",
       " (126880, 'love'),\n",
       " (126652, 'see'),\n",
       " (125711, \"that's\"),\n",
       " (122615, 'her'),\n",
       " (122437, 'them'),\n",
       " (120504, '’'),\n",
       " (115386, 'him'),\n",
       " (114471, 'been'),\n",
       " (113654, 'go'),\n",
       " (113017, 'right'),\n",
       " (111475, 'did'),\n",
       " (107790, 'some'),\n",
       " (107684, 'really'),\n",
       " (105022, 'need'),\n",
       " (105001, 'only'),\n",
       " (104552, 'got'),\n",
       " (103981, 'our'),\n",
       " (103748, 'had'),\n",
       " (102614, 'us'),\n",
       " (101511, 'want'),\n",
       " (100776, 'should'),\n",
       " (99724, 'still'),\n",
       " (98749, 'going'),\n",
       " (98514, \"you're\"),\n",
       " (97743, 'much'),\n",
       " (97639, \"can't\"),\n",
       " (96945, 'new'),\n",
       " (96700, 'their'),\n",
       " (93882, 'than'),\n",
       " (93179, 'great'),\n",
       " (92280, 'even'),\n",
       " (91709, 'make'),\n",
       " (91685, 's'),\n",
       " (91070, 'back'),\n",
       " (89797, 'because'),\n",
       " (87933, 'here'),\n",
       " (86510, 'never'),\n",
       " (84081, 'were'),\n",
       " (83497, 'day'),\n",
       " (83286, 'thanks'),\n",
       " (83185, 'then'),\n",
       " (81854, '2'),\n",
       " (81509, 'say'),\n",
       " (81161, '..'),\n",
       " (81036, 'way'),\n",
       " (79254, 'well'),\n",
       " (78660, '️'),\n",
       " (77996, 'over'),\n",
       " (77498, 'also'),\n",
       " (76853, '*'),\n",
       " (75252, 'thank'),\n",
       " (72610, 'am'),\n",
       " (72229, 'any'),\n",
       " (71696, 'being'),\n",
       " (70378, 'could'),\n",
       " (70298, \"he's\"),\n",
       " (69533, 'yes'),\n",
       " (68349, 'said'),\n",
       " (68219, 'work'),\n",
       " (67755, 'better'),\n",
       " (67364, 'very'),\n",
       " (67348, \"didn't\"),\n",
       " (66720, 'first'),\n",
       " (66568, 'these'),\n",
       " (66257, 'last'),\n",
       " (66002, 'today'),\n",
       " (65946, 'where'),\n",
       " (65848, 'off'),\n",
       " (65414, 'take'),\n",
       " (65193, 'after'),\n",
       " (65096, 'man'),\n",
       " (64586, 'hillary'),\n",
       " (64470, 'best'),\n",
       " (63815, 'same'),\n",
       " (62871, 'thing'),\n",
       " (62202, 'look'),\n",
       " (61651, 'oh'),\n",
       " (61538, 'w'),\n",
       " (61375, 'other'),\n",
       " (61088, '$'),\n",
       " (60808, \"i've\"),\n",
       " (60385, 'shit'),\n",
       " (60186, 'always'),\n",
       " (59953, 'sure'),\n",
       " (59795, 'does'),\n",
       " (59320, 'many'),\n",
       " (59218, 'those'),\n",
       " (58463, 'into'),\n",
       " (56745, 'down'),\n",
       " (56703, 'year'),\n",
       " (56568, 'yeah'),\n",
       " (56309, 'come'),\n",
       " (55368, 'happy'),\n",
       " (54601, 'bad'),\n",
       " (53957, 'most'),\n",
       " (53744, 'years'),\n",
       " (53725, 'ever'),\n",
       " (53708, 'hope'),\n",
       " (53499, 'please'),\n",
       " (52543, 'every'),\n",
       " (52423, 'game'),\n",
       " (52350, 'next'),\n",
       " (52296, 'its'),\n",
       " (52131, \"i'll\"),\n",
       " (51780, 'again'),\n",
       " (51464, 'show'),\n",
       " (51456, \"doesn't\"),\n",
       " (50485, 'feel'),\n",
       " (50314, 'something'),\n",
       " (50105, 'actually'),\n",
       " (49591, 'life'),\n",
       " (48399, 'someone'),\n",
       " (48365, 'before'),\n",
       " (48311, 'vote')]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort dictionary by word frequency\n",
    "word_freq = [(dictionary[key][1], key) for key in dictionary.keys()]\n",
    "words_sort_by_freq = sorted(word_freq, reverse=True)\n",
    "freq = np.asarray(words_sort_by_freq)[:, 0].astype(np.int32)\n",
    "\n",
    "words_sort_by_freq[:200]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Keep the most frequent 50000 words will drop 1066692 tokens which is 1.352 percent.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(19, 'reputed')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set the vocabulary size threshold\n",
    "num_keep_words = 50000\n",
    "\n",
    "# number of words that will be dropped\n",
    "num_words_drop = freq[num_keep_words:].sum()\n",
    "percent_drop = num_words_drop / total_words\n",
    "\n",
    "info = \"Keep the most frequent {} words will drop {} tokens \"\n",
    "info += \"which is {:.3f} percent.\"\n",
    "print(info.format(num_keep_words, num_words_drop, 100 * percent_drop))\n",
    "\n",
    "words_sort_by_freq[num_keep_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.legend.Legend at 0x7f2aa9006198>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtwAAAHoCAYAAAB6qjhPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xu4VWW99//3N85qIioqgQb7wTRPpaFbdxpiZZgureygiZmZ2NO2MvtVVI9lun9qP8tMt2lYbNppmnZUUekASrb1t0WtTN2WiSl4QAUxkwTk+/wx51otFmvBWGvNseaB9+u65rXWvMc9x/jOcaF8vL3HfUdmIkmSJKkcr6h3AZIkSVIrM3BLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUaXO8CaiUi2oC2V77ylSe/5jWvqXc5kiRJanF33XXXM5k5emP9otW2dp80aVIuXLiw3mVIkiSpxUXEXZk5aWP9nFIiSZIklcjALUmSJJXIwC1JkiSVyMAtSZIklcjALUmSJJXIwC1JkiSVqGXW4ZYkSa3l+eefZ+nSpaxevbrepWgTMnjwYIYPH87o0aMZPnx4bc5Zk7NIkiTV0PPPP89TTz3F2LFjGTFiBBFR75K0CchM1qxZwwsvvMCjjz7K9ttvz8iRI/t9XgO3JElqOEuXLmXs2LFsttlm9S5Fm5CIYMiQIYwaNYphw4bx5JNP1iRwO4dbkiQ1nNWrVzNixIh6l6FN2IgRI3jppZdqci4DtyRJakhOI1E91fLPn4FbkiRJKpGBW5IkSSqRgVuSJKlkt9xyCxGx3murrbZar+/y5cv58Ic/zLbbbsvmm2/OW97yFu699971+v3973/n05/+NGPGjGHEiBEccMABLFiwYL1+a9eu5dxzz2X8+PEMHz6c173udfzoRz/qts7LL7+cXXfdlWHDhrHLLrtw2WWX9f/LD7DZs2cTETzyyCP1LqVDQwfuiHhFRPy/EXFxRJxQ73okSZL646KLLuL222/veP3yl79c53hm0tbWxs0338zFF1/Mj370I1avXs2UKVNYvHjxOn1POukkLr/8cs466yxuuOEGxowZw9ve9jZ++9vfrtPvjDPO4Mwzz+TUU0/lpptuYv/99+c973kPN9544zr9Lr/8ck455RSOPvpobr75Zt7znvfw0Y9+lEsvvbScm7EpycwBfQGzgKXAH7q0TwUeBB4CZlTb3gl8F7gAeHOR87/hDW9ISZLU3O6///56l1BT8+fPTyB/8YtfbLDfT3/60wRy3rx5HW3PPfdcjho1Kj/2sY91tP32t79NIGfNmtXRtnr16nzNa16TbW1tHW1PPfVUDh06NL/4xS+uc51DDjkk99xzz3U+O3r06PzABz6wTr8TTzwxt9lmm1y1alXvvnCJ1q5dmy+99FKPx//jP/4jgVy0aFG/r7WxP4fAwiyQT+sxwj27Gq47RMQg4BLgMGA34NiI2A3YBfivzDwd+N8DXKckSdKAuu6663jVq17FlClTOtpGjhxJW1sbP/vZz9bpN2TIEN73vvd1tA0ePJhjjjmGuXPndixnN3fuXFatWsW0adPWuc60adO49957WbRoEQC33347Tz/99Hr9jj/+eJ599lluu+22Hmv+2te+xmabbcaqVas62o4++mgiYp0R/Msvv5zBgwfz/PPPd7RdccUVvO51r2P48OFsu+22HH/88TzxxBPrnH/8+PFMmzaNWbNmseuuuzJ06FDmzJkDwMMPP8zhhx/OZpttxujRo/nEJz7R7VJ+3//+99l7773ZYost2HLLLdlzzz351re+1eN3qrUBD9yZuQBY1qV5P+ChzHw4M1cBVwNHAYuB5dU+Lw9clZIkSbV33HHHMWjQILbZZhve//738+ijj65z/L777mOPPfZY73O77747jz76KC+88EJH
vwkTJqy3MdDuu+/OqlWreOihhzr6DRs2jIkTJ67XD+D+++/v6Aesd+2u/bozZcoUVq5cyR133AFUZk/ccsstjBgxgnnz5nX0mzdvHm94wxvYcsstAZg5cybHH388r33ta/nxj3/Meeedx9y5c5k8eXLH92w3f/58LrjgAr70pS9x8803s9dee7Fq1Sre+ta3cs8993DJJZcwe/ZsFi1axL/927+t89nbbruNadOmMXnyZH7605/ywx/+kJNPPpnnnnuux+9Ua42y0+RY4LFO7xcD/wx8A7g4Ig4C1n8KoCoipgPTAXbaaacSy9Qm6eCDKz9vuaWeVUiS2rX/e7le+vD3wciRI/nUpz7F5MmT2XLLLbnnnns455xzOOCAA7jnnnvYbrvtAFi2bBnjx49f7/Nbb701UHmgcosttmDZsmWMGjWqx37Lli3r+LnVVlutt6Z0d/2A9c7ZtV93Xv/61zNq1Cjmz5/Pm970Jn73u9+xfPlyTjvtNObPn9/R75ZbbuGEEyqP5L388succcYZHHzwwVx99dUdfXbddVcOOuggZs2axcc//vGO9uXLl3PXXXexww47dLRdfvnlPPzww9x+++3sv//+ABx22GHsueee69R3xx13sNVWW3HhhRd2tB166KE9fp8yNPRDk5n5YmaelJkfy8xLNtBvZmZOysxJo0ePHsgSJUmSNmrvvffmq1/9Km1tbUyePJnTTjuNm2++maeeeoqLLrqo3uX1yyte8QomT57cMZo9b9489tprL97znvewcOFC/vrXv3L//ffz5JNPdkyVefDBB1m6dCnHHXfcOuc68MADefWrX82tt966Tvv++++/TtiGyjSYHXfcsSNst9fy3ve+d51+++67L8uXL2fatGnccMMNAzqy3a5RRriXADt2ej+u2iZJkrSuFvk/jvvssw+vec1ruPPOOzvaRo0axfLly9fr23UEetSoUfzlL3/psV/7yPSoUaN47rnnyMx1Rrm76weVkeQxY8b02K8nU6ZM4dOf/jQrV65k/vz5TJkyhX333Zfhw4fz61//mkWLFjFkyBAOPPDAdc7b+Vrtdthhh/VG1Lvr98QTT7D99tuv1961bfLkyVx77bVcfPHFvPOd7+xou+CCC9hrr702+L1qpVFGuO8Edo6ICRExFDgGuK43J4iItoiYuWLFilIKlCRJKkPnILz77rt3zKfu7P7772ennXZiiy226Oi3aNEiXnzxxfX6DR06tGPO9u67785LL73En//85/X6Aey2224d/YD1rt21X0+mTJnCqlWrWLBgAQsWLOCQQw5h8ODBHHTQQcybN4958+ax3377sfnmmwP/CPBPPvnkeud68skn1wv43W2zPmbMGJ566qn12rtre/e7382tt97K8uXL+clPfsITTzzB1KlTWbt27Qa/V60MeOCOiKuA24FdImJxRJyUmWuAU4G5wAPANZm5/p+2DcjM6zNz+siRI2tftCRJUo0tXLiQBx98kP3226+j7cgjj2TJkiXrTKl4/vnnuf766znyyCM72tra2li9ejXXXnttR9uaNWv4wQ9+wKGHHsqwYcMAmDp1KkOGDOHKK69c59pXXHEFe+yxBxMmTADggAMOYNttt+2239Zbb80b3/jGDX6XPfbYg9GjR3P++efzt7/9jcmTJwNwyCGH8Ktf/Ypbb711nZVXdtllF7bffvt15m8D/Nd//Rd/+ctfOLjAPP0DDjiAxx57rONhTahs8nPNNdf0+JktttiCI444glNOOYUnnniCZ599dqPXqYUBn1KSmcf20H4jcGN3xyRJkprZcccdx4QJE9hnn33YaqutuOeeezj33HMZO3bsOg8HHnnkkRxwwAFMmzaN888/n1GjRnHuueeSmXzmM5/p6Lf33nvzvve9j9NOO43Vq1czYcIELr30UhYtWrROaN5uu+04/fTTOffcc3nlK1/JPvvsww9+8APmzZvHddf9YzLBkCFDOPvss/noRz/K2LFjectb
3sK8efOYNWsWF198MUOHDt3g94sIDj74YK699lr23XffjpVI2qeaQCV8txs0aBBnnXUWp5xyCtOmTWPatGksWbKEL3zhC+y888586EMf2ug9PeGEEzjvvPN417vexTnnnMN2223HZZddts6ygwBf/OIXeeqpp5gyZQqvetWrWLx4MRdddBGvf/3rGbBn/4os1t0ML6ANmDlx4sQNLlAu9drkyZWXJGnAtNrGN+ecc07uueeeueWWW+bgwYNz3LhxefLJJ+fjjz++Xt9nn302TzzxxBw1alSOGDEiDznkkPztb3+7Xr8XX3wxP/nJT+b222+fw4YNy/322y/nz5+/Xr81a9bk2WefnTvttFMOHTo099xzz7z22mu7rfOyyy7LnXfeOYcOHZoTJ07MSy65pPB3/OY3v5lAfvazn+1oe/nll3PUqFE5bNiwXLly5Xqf+d73vpd77bVXDh06NLfeeuucNm3aevfk1a9+dR533HHdXvPPf/5zHnbYYTlixIjcdttt8+Mf/3hedtll62x8c8MNN+Shhx6aO+ywQw4dOjTHjRuXH/rQh3LJkiUb/U612vgmKn1bx6RJk3LhwoX1LkOtxGUBJWnAPfDAA7z2ta+tdxnaxG3sz2FE3JWZkzZ2nkZ5aFKSJElqSQZuSZIkqUQtE7hdFlCSJEmNqGUCd7osoCRJkhpQywRuSZIkqREZuCVJUkNqtZXU1Fxq+eevZQK3c7glSWodQ4YMYeXKlfUuQ5uwlStXduzY2V8tE7idwy1JUuvYbrvtWLJkCS+++KIj3Rowmcnq1atZtmwZixcvZptttqnJeQd8a3dJkqSNad8a/PHHH2f16tV1rkabksGDBzN8+HB22mknhg8fXptz1uQskiRJNbblllt2BG+pmbXMlBJJkiSpERm4JUmSpBK1TOB2lRJJkiQ1opYJ3K5SIkmSpEbUMoFbkiRJakQGbkmSJKlEBm5JkiSpRAZuSZIkqUQtE7hdpUSSJEmNqGUCt6uUSJIkqRG1TOCWJEmSGpGBW5IkSSqRgVuSJEkqkYFbkiRJKpGBW5IkSSqRgVuSJEkqUcsEbtfhliRJUiNqmcDtOtySJElqRC0TuCVJkqRGZOCukfEz5jB+xpx6lyFJkqQGY+CWJEmSSmTgliRJkkpk4JYkSZJKZOCWJEmSSmTgliRJkkpk4JYkSZJKZOCWJEmSSmTgliRJkkrUMoE7ItoiYuaKFSvqXYokSZLUoWUCd2Zen5nTR44cWe9SJEmSpA4tE7glSZKkRmTgliRJkkpk4JYkSZJKZOCWJEmSSmTgliRJkkpk4JYkSZJKZOCWJEmSSmTgliRJkkpk4JYkSZJKZOCWJEmSSmTgliRJkkpk4JYkSZJKZOCWJEmSSmTgliRJkkrU0IE7Ig6OiF9HxGURcXC965EkSZJ6a8ADd0TMioilEfGHLu1TI+LBiHgoImZUmxN4ARgOLB7oWiVJkqT+qscI92xgaueGiBgEXAIcBuwGHBsRuwG/zszDgM8CXx7gOiVJkqR+G/DAnZkLgGVdmvcDHsrMhzNzFXA1cFRmrq0eXw4MG8AyJUmSpJoYXO8CqsYCj3V6vxj454h4F/A2YCvg33v6cERMB6YD7LTTTiWWKUmSJPVOowTubmXmj4EfF+g3E5gJMGnSpCy7LkmSJKmoRlmlZAmwY6f346ptkiRJUlNrlMB9J7BzREyIiKHAMcB1vTlBRLRFxMwVK1aUUqAkSZLUF/VYFvAq4HZgl4hYHBEnZeYa4FRgLvAAcE1m3teb82bm9Zk5feTIkbUvWpIkSeqjAZ/DnZnH9tB+I3DjAJcjSZIklapRppT0m1NKJEmS1IhaJnA7pUSSJEmNqGUCtyRJktSIDNySJElSiVomcDuHW5IkSY2oZQK3c7glSZLUiFomcG9qxs+Yw/gZc+pdhiRJkjbCwC1JkiSVqGUCt3O4JUmS1IhaJnA7h1uSJEmNqGUCtyRJktSIDNySJElSiQzckiRJ
UolaJnD70KQkSZIaUcsEbh+alCRJUiNqmcAtSZIkNSIDtyRJklQiA7ckSZJUIgO3JEmSVCIDtyRJklSilgncLgsoSZKkRtQygdtlASVJktSIWiZwS5IkSY3IwC1JkiSVyMAtSZIklcjALUmSJJXIwC1JkiSVyMAtSZIklahlArfrcEuSJKkRtUzgdh1uSZIkNaKWCdySJElSIzJwb6LGz5jD+Blz6l2GJElSyzNwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklapnAHRFtETFzxYoV9S5FkiRJ6tAygTszr8/M6SNHjqx3KZIkSVKHlgnckiRJUiMycEuSJEklKhS4I2LbiNipS9spEXFxRBxRTmmSJElS8ys6wj0LmNH+JiLOAC4F3g/8LCLeV0JtkiRJUtMrGrgnAb/q9P4jwDmZuQ1wCXB6rQtTYxk/Yw7jZ8ypdxmSJElNp2jg3hp4CiAi9gB2AL5bPfZTYJfalyZJkiQ1v6KB+1lgXPX3Q4DHM/NP1fdDenEeSZIkaZMyuGC/XwJnRsS2wKeojGq32xX4S60LkyRJklpB0ZHpzwCPAecCfwa+3OnYccBtNa5LkiRJagmFRrgz8yngrT0cfgvw95pVJEmSJLWQolNKepSZz9eiEEmSJKkV9Ri4I2JWL86TmXlSDeqRJEmSWsqGRrgPAbLT+62AkcAaKquWbFP9/ApgeVkFSpIkSc2sx4cmM3N8Zk7IzAnA8cALwDHAiMwcA4wAjgX+CkwbiGIlSZKkZlN0lZILgHMz85rMfBkgM1/OzB8AXwEuLKvAiNg8IhZGxBFlXUOSJEkqS9HAvSfwUA/H/gTsUfSCETErIpZGxB+6tE+NiAcj4qGImNHp0GeBa4qeX/Xj1u+SJEnrKxq4nwTe28OxY6hu+17QbGBq54aIGARcAhwG7AYcGxG7RcRbgfuBpb04vyRJktQwii4LeCHw9YgYA1xLJWBvTyWEvw04regFM3NBRIzv0rwf8FBmPgwQEVcDRwFbAJtTCeErI+LGzFzb9ZwRMR2YDrDTTjsVLUWSJEkqXdGNb74RES8AX6IyCt3uMeDkzOzNEoLdGVs9V7vFwD9n5qkAEfFB4Jnuwna1vpnATIBJkyZld30kSZKkeii88U1mfqe6Nvc4YAzwBLA4M0sPuJk5u+xrSJIkSWXY6BzuiBgaEXdHxKFZ8Vhm/nf1Z63C9hJgx07vx1XbCouItoiYuWLFihqVJEmSJPXfRgN3Zq4CJlDZ8KYsdwI7R8SEiBhK5UHM63pzgsy8PjOnjxw5spQCJUmSpL4oukrJL4BDa3HBiLgKuB3YJSIWR8RJmbkGOBWYCzwAXJOZ99XiepIkSVI9FZ3DfTFwRUQMBn5KZf72OtNJ2lcY2ZjMPLaH9huBGwvWI0mSJDWFooH71urP04FP9tBnUP/L6buIaAPaJk6cWM8y1IP2TXEeOe/wOlciSZI0sIoG7hNLraIGMvN64PpJkyadXO9aJEmSpHZF1+H+btmFSJIkSa2o8DrcABERVHZ93BpYBtw/EOtwS5IkSc2q6ColRMSHqTws+XvglurPxyPipHJK6x3X4ZYkSVIjKhS4I+I4Klun3wt8CHh79ee9wMyI6HblkYHkOtySJElqREWnlHwGuDIzj+/S/t2I+B7wWeCqmlYmSZIktYCiU0p2Aa7o4dgV1eNSr42fMadjyUBJkqRWVDRw/xUY18OxcdXjdeUcbkmSJDWiooH7JuCciDioc2NEHAD8W/V4XTmHW5IkSY2oN3O49wduiYglVFYr2YHK6PZD1eOSJEmSuii68c2TEfF6KiuTHERlHe5HqGz5PjszXyytQkmSJKmJFd74phqq/736kiRJklRA0XW4vxIRh0bEZmUX1Fc+NNlaXL1EkiS1iqIPTR4H3Awsj4jfRMTZEXFIRAwrsbZe8aFJSZIkNaJCgTszxwG7Ap8AFgPTgV8C
z0XE/Ig4o7wSJUmSpOZVdISbzPxjZl6Wme/LzO2pPDx5GzAZOLOk+iRJkqSmVvihyYgYARwIHAJMAfYBXgRuAOaVUp0kSZLU5AoF7ohYAOwHrAZ+A/wE+BhwV2auLa88SZIkqbkVHeE+EFgJ/CcwF7g1M10ORAOqfdWSR847vM6VSJIkFVd0DvdewOeAVwGzgWci4s7qcoFTG2G5QJcFlCRJUiMqukrJHzLzosx8J7AN8M/AD6o/5wDLyiuxGJcFlCRJUiMq/NAkQEQMAf6FykOTh1AJ3AEsr31pkiRJUvMrutPk5yPil8BzwHzgX4GlwCeB3TNzTHklSt1zJ0pJktQMio5wfxpYAHwBmJeZvy+vJEmSJKl1FA3c27j8nyRJktR7RR+aNGxLkiRJfVB4a3dJkiRJvdcygdt1uOVDlJIkqRG1TOB2HW5JkiQ1oh4Dd0R8PCK2q/6+U3UNbkmSJEm9sKER7q8D46u/LwL2Lr0aSZIkqcVsKHA/B+xQ/T2ALL8cqTbGz5jjnG5JktQQNrQO92+A70bE76rvL42I53vom5n55tqWJkmSJDW/DY1wnwxcBaylMro9GBjSw2touWVKkiRJzanHEe7MfAr4KEBErAWmZ+Z/D1RhUi2NnzGHR847vN5lSJKkTVDRrd0nAE+UWYgkSZLUigoF7sz8C0BEHAFMBrYGlgHzM/PG8sqTaqv9QUpHuyVJ0kApFLgj4pXADcBBwBrgWWAb4PSI+DVwRGa+UFqVkiRJUpMqutPkOcA+wPHAiMwcA4wAPlBtP6ec8iRJkqTmVjRwHw38n8y8MjNfBsjMlzPzSuCM6vG6ioi2iJi5YsWKepciSZIkdSgauLcB7u/h2P3V43WVmddn5vSRI0fWuxRJkiSpQ9HAvQg4oodjb68el5qKO1FKkqSBUHRZwG8BX4uILYArqSwRuANwDPBh4PRyypMkSZKaW9FlAb8eEaOpBOsPVpsDWAWcl5nfKKc8aWC4XKAkSSpL0RFuMvPzEXE+sD//WIf7jsxcXlZxkiRJUrMrOocbgMxcnpk3VVcrucmwrVbk3G5JklRLhUe4pU1NxzST+pYhSZKaXK9GuCVJkiT1joFbkiRJKpGBW5IkSSrRRgN3RAyNiK9HxL4DUZAkSZLUSjYauDNzFXAKMKL8cqTGc8fDz9a7BEmS1MSKTim5B9izzEIkSZKkVlQ0cH8K+H8i4oiIiDILkiRJklpJ0cB9LbAN8DNgZUQ8FhGPdnr9pbwSpcbQvi73+Blz3BxHkiQVVnTjm18BWWYh3YmI1wKfALYFfpWZlw50DZIkSVJ/FArcmfnBWl0wImYBRwBLM3OPTu1TgW8Ag4BvZ+Z5mfkA8JGIeAXwn4CBWy2rY2fL8w6vcyWSJKmW6rEO92xgaueGiBgEXAIcBuwGHBsRu1WPHQnMAW4c2DKlDXNaiSRJKqJw4I6IvSPixxHxTESsiYh9qu3nVEenC8nMBcCyLs37AQ9l5sPVZQivBo6q9r8uMw8DjttAbdMjYmFELHz66aeLliJJkiSVrlDgjogDgduBXYHvd/ncWuAj/axjLPBYp/eLgbERcXBEXBQR32IDI9yZOTMzJ2XmpNGjR/ezFKl3HOmWJEkbUvShyfOAucA7qMyxPrXTsbuBD9S4LgAy8xbgljLOLdWS868lSVJPik4p2Qe4NDOT9VcreQbo77DyEmDHTu/HVdsKi4i2iJi5YsWKfpYiSZIk1U7RwP13YLMejo0B+pty7wR2jogJETEUOAa4rjcnyMzrM3P6yJEj+1mKJEmSVDtFA/dtwGnV1UTatY90nwTMK3rBiLiKynzwXSJicUSclJlrqExTmQs8AFyTmfcVPafUKJzPLUmSuio6h/sM4DfA74AfUgnbJ0TEBcAbgH2LXjAzj+2h/UZc+k8tYvyMOc7nliRJQPGNb34XEW8Czge+AASVEelfA5Mz88HySiwmItqAtokTJ9a7FAlYd7S7P+HbBzIlSWpuhdfh
zsy7M/PNwCupPNS4ZWZOycx7SquuF5zDLUmSpEbU650mM/PvwOrMfLGEeiRthPPEJUlqLr3ZaXJyRNwaESuBJyNiZUTcUp1qIqkAw7IkSZueojtNvofKSiTbUZnH/XHgq8D2wLyIeHdpFRbkOtySJElqREVHuM8C5gC7Z+YXM/OSzDwD2B24GTi7rAKLcg63GllZI9vt53XkXJKkxlU0cE+gstPk2s6N1fffBMbXuC6pZRmOJUnatBQN3H+i5+3bRwMP1aYcadMwfsYcg7ckSZuIooH7C8CXI2KdDW4i4p+BM4HP1biuXnMOtyRJkhpRjxvfRMSCLk3DgTsi4jHgKSoPTO4ILAU+DdxQVpFFZOb1wPWTJk06uZ51SJIkSZ1taIR7LfByp9f/AAuARcCL1Z8Lqu1reziHpA1wWokkSa2vxxHuzDx4AOuQNmn1DN7jZ8xx23hJkkrU650mJdVWLcO2D2NKktR4ehzh7ioiBgMHUJm3Pbzr8cycVcO6ei0i2oC2iRMn1rMMqd8ccZYkqbUUCtwRsQ/wE2AcEN10SaCugduHJqV/aA/t7aPdBnhJkuqn6JSSy4AXgHcAu1DZCKfz659KqU7SgHI6iiRJtVd0SsluwHsz88Yyi5FUUetpJY50S5JUP0UD9x+BzcssRNK6HG2WJKk1FJ1S8nng/0TETmUWI6lchnhJkgZeocCdmTcDNwF/ioh7I2JBl9et5ZYpaaBtKJwb3CVJKq5Q4I6IGcBngOeA51l3B8qXaYCdJiOiLSJmrlixot6lSA2vaGCudT9JkjZFRedwnwZ8Czg1M18usZ4+c1lAqZj2cNybMO3DlpIk9V3ROdybAdc2atiWNgWNMIrcCDVIktRsigbum6jsMimpAdR6C/daB2m3mJck6R+KBu4LgQ9ExBci4g0R8U9dX2UWKak5FAnZBnFJ0qam6Bzu31R/ng2c1UOfQf0vR1Jv1SrANtp5JElqFUUD94eALLMQSc3BQC1JUu8UCtyZObvkOiQV0Ghhd2P1tK9w4konkqRNWdE53A3Pdbil2ij6wKNreUuSVEyhEe6ImLWRLpmZJ9Wgnj5zHW6pvmo5B9zRcElSKyk6h/sQ1p/DvTXwSiq7Tz5Xy6IktZ7uAnkZ4doP1kS+AAAYc0lEQVTALklqNIWmlGTm+Myc0OU1EjgYeBI4uswiJW1cs03JcAlBSdKmol9zuDNzAfB14OLalCOpVpolrBaZM94s30WSpO7U4qHJh4G9a3AeSQ2qkXe2NIxLkhpdvwJ3RAwGPggsrkk1kjZpGwrPnY/11M/wLUlqREVXKZnXTfNQ4DXANsBHalmUJHWna+je0MOR7X19gFKSVG9FVyl5BeuvUvJX4MfA1Zl5Sy2LkrRpKzKa3ZdzGr4lSfVQdKfJg0uuQ5J6zZVOJEnNoGV2mpQkSZIaUdEpJUTElsDbgZ2A4V0OZ2aeXcvCeisi2oC2iRMn1rMMSXXgKLYkqZEVfWjyjcD1wFY9dEmgroHbrd0l1YrzvSVJtVR0hPtC4BHgZODezFxVWkWS1E9FR7zbg3WRjXcM4JKkvioauF8LvDcz7yqzGEkqW9flAp2OIkkqW9GHJh8FhpVZiCQNJIO2JGmgFA3cXwZmVB+clKSmVMs1vQ3skqSiik4pOQLYHlgUEbcDy7ocz8w8oaaVSVIdOW9bklQrRUe4D6SyEsnzwO7AQd28JKnl9WZku2tfR8UladNUKHBn5oSNvP6p7EIlqdH0FKAN1pKkztxpUpJ60HWutgFbktQXBm5JqiFDuSSpq8Jbu0uS1tcepHuar91d0O66FrgkqbU5wi1JBZQxQu1ouCRtGgzckjSADNOStOkxcEuSJEklMnBLUp31tHOlo+GS1BoKPTQZEWupbHzTnQRWAHcD52fmz2tUmyS1vCLLDkqSmlvREe6zgceAp4HZwFeA71bfLwa+B4wGboqII2pZYES8IyIuj4gfRMShtTy3JDU7Q7okNb6igfvvwCJg
fGaelJmfz8wPAROAR6gE732AnwOf39jJImJWRCyNiD90aZ8aEQ9GxEMRMQMgM3+amScDHwHeV/SLSZIkSY2gaOD+CPD1zPx758bMXAl8HfhIZq4Fvg3sVeB8s4GpnRsiYhBwCXAYsBtwbETs1qnL/6kel6RNSndzvHua9y1JajxFA/doYEgPx4YC21R/fwaIjZ0sMxcAy7o07wc8lJkPZ+Yq4GrgqKj4CnBTZt7d3fkiYnpELIyIhU8//XSBryNJzamnhysN35LUuIoG7ruAMyNiTOfGiHgV8CVgYbXp1cDjfaxlLJV54u0WV9s+BrwFeHdEfKS7D2bmzMyclJmTRo8e3cfLS1LjKLJTZU+fM3xLUmMpurX7J4BfAQ9HxB3AUmA74ADgRWBatd9E4Pu1LDAzLwIuquU5JUmSpIFSaIS7OpVjIpX52muBPas/vwbsnJm/rfb7YmZ+qY+1LAF27PR+XLWtkIhoi4iZK1as6OPlJam19HW021FySaqtoiPcZOazFFiBpB/uBHaOiAlUgvYxwPuLfjgzrweunzRp0skl1SdJdbWxEGxIlqTG1KudJiNi64g4PCKOj4i3R8TWfbloRFwF3A7sEhGLI+KkzFwDnArMBR4ArsnM+/pyfknS+jY0L1ySVJ7CI9wR8W/Ap4BhnZpfioivZuYZvbloZh7bQ/uNwI29OZckSZLUyAqNcEfEaVSmk1wBTAFeW/15BfD5iPh4aRUW5BxuSZIkNaLebHzzjcw8OTNvzcwHqz9PprKCyEfLK7GYzLw+M6ePHDmy3qVIUsMrMq3EqSeSVBtFA/d4oKd/886pHpckNThDtCQNvKKB+1lgjx6O7V49XldOKZGk7vVmmb/ebrQjSdq4ooH7J8DZ1dVJBgNExOCIOBY4C/hRWQUW5ZQSSfqH3oRkVy+RpHIVXaXkc8DrgO8CsyJiGbA1MAi4jXLX55YklaAvofyR8w4vqxxJalmFAndm/jUi3gQcDhxEJWwvA24FbsrMLK9ESVLZHN2WpPIU3vgmK27IzM9WVyv5bGbeaNiWpE1Pf+aES9Kmplc7TTYyH5qUpIHROUT35oFMSdpU9Ri4I2JtRLxc8LVmIIvujg9NSlLt9Xaet6ucSNL6NjSH+yzA6SKSJElSP/QYuDPzzAGsQ5LUxBzFlqSetcwcbknSwOpryDacS9rUtEzg9qFJSSpfkbDsg5SStK6WCdw+NClJjWtDAdyALqnVtUzgliQ1l64hu6fQbRiX1OwM3JIkSVKJDNySpFLUemTakW5JzcrALUmSJJWoZQK3q5RIUmPrzcORG9q1sr3dEW9JzaJlArerlEhSa3B7eEmtpmUCtyRp02EAl9RMDNySpAHX07SQvgRp1/GW1OgM3JKklmQQl9QoDNySpJZj0JbUSAzckqRNjoFc0kAycEuSJEklMnBLkiRJJWqZwO3GN5LUepz6IakVtEzgduMbSZIBXVIjapnALUnatBm2JTUqA7ckqWn1JmQbyCXVi4FbktRSDOGSGo2BW5IkSSqRgVuStElzlFtS2QzckqSWt7FQPX7GHIO3pNIYuCVJkqQSDa53AZIklanzyLWj2JLqwRFuSZIkqUQtE7jd2l2S1Bt9Ge12hFxSX7RM4HZrd0mSJDWilgnckiTVQvsodpGVTfrzeUmbDgO3JElVXUPyxpYLNFRLKsLALUmSJJXIwC1JUgGdp4psbKnB/o58O3IutRYDtyRJklQiA7ckSZJUIgO3JEmSVCIDtyRJXfR1U5y+LiUoqbUZuCVJkqQSGbglSdqIZhu5brR6pE2dgVuSJEkqkYFbkqQ66u1otKPXUvMxcEuSJEklMnBLkjSAaj1C7Yi31PgM3JIkSVKJGjpwR8Q/RcR3IuKH9a5FkqT+qOVIdG/O5Qi4VH8DHrgjYlZELI2IP3RpnxoRD0bEQxExAyAzH87Mkwa6RkmSJKlW6jHCPRuY2rkhIgYBlwCHAbsBx0bEbgNfmiRJklRbAx64M3MB
sKxL837AQ9UR7VXA1cBRRc8ZEdMjYmFELHz66adrWK0kSf3XbBvnSKqtRpnDPRZ4rNP7xcDYiNgmIi4D9o6Iz/X04cycmZmTMnPS6NGjy65VkiRJKmxwvQvYkMx8FvhIveuQJEmS+qpRAvcSYMdO78dV2wqLiDagbeLEibWsS5KkmuvLKiOPnHd4r6/R/pm+nkNSbTTKlJI7gZ0jYkJEDAWOAa7rzQky8/rMnD5y5MhSCpQkSZL6oh7LAl4F3A7sEhGLI+KkzFwDnArMBR4ArsnM+wa6NkmSJKnWBnxKSWYe20P7jcCNA1yOJEmSVKpGmVLSbxHRFhEzV6xYUe9SJEnqtfEz5nS82t93PV6La5TxGZc1lDasZQK3c7glSZLUiFomcEuSJEmNqGUCt1NKJEmtZkNTNTpPP9lQW2/P21dOK5F61jKB2yklkiRJakQtE7glSZKkRmTgliRJkkpk4JYkSZJK1DKB24cmJUmtrqcHE3vT3rmt69rfnds2VocPSUrFtUzg9qFJSZIkNaKWCdySJElSIzJwS5IkSSUycEuSJEklGlzvAmolItqAtokTJ9a7FEmSgMbefbGRa+tJe82PnHd4nSuReqdlRrh9aFKSJEmNqGUCtyRJktSIDNySJElSiQzckiRJUokM3JIkSVKJXKVEkiQB62/73t2x3q4Q0vk8nT87fsYcHjnv8PWu058VSNrP2fV3qd5aZoTbVUokSZLUiFomcEuSJEmNyMAtSZIklcjALUmSJJXIwC1JkiSVyMAtSZIklcjALUmSJJXIwC1JkiSVqGUCd0S0RcTMFStW1LsUSZIGXNcNZAbi3ONnzFlvs5z+1tH++Q2dqz/X6Hze3pynL5+R2rVM4HbjG0mSJDWilgnckiRJUiMycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJWqZwO3W7pIk9V0ZW5ZvbGv2Ilu399S36xbtG6q/u2MbOu/GPtNdDT1dt69byfdW0XO7NX19tEzgdmt3SZIkNaKWCdySJElSIzJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEklGlzvAjYkIjYHvgmsAm7JzCvrXJIkSZLUKwM+wh0RsyJiaUT8oUv71Ih4MCIeiogZ1eZ3AT/MzJOBIwe6VkmSJKm/6jGlZDYwtXNDRAwCLgEOA3YDjo2I3YBxwGPVbi8PYI2SJElSTQx44M7MBcCyLs37AQ9l5sOZuQq4GjgKWEwldMMGao2I6RGxMCIWPv3002WULUmSgPEz5vTrs335/IY+0/VY52t0vV5P52nv1/lzRa+xofMVqbdzW081FLleb9o7n7On71z0vBu63xvSnz9HZZynbI3y0ORY/jGSDZWgPRb4MXB0RFwKXN/ThzNzZmZOysxJo0ePLrdSSZIkqRca+qHJzPwbcGK965AkSZL6qlFGuJcAO3Z6P67aVlhEtEXEzBUrVtS0MEmSJKk/GiVw3wnsHBETImIocAxwXW9OkJnXZ+b0kSNHllKgJEmS1Bf1WBbwKuB2YJeIWBwRJ2XmGuBUYC7wAHBNZt430LVJkiRJtTbgc7gz89ge2m8EbuzreSOiDWibOHFiX08hSZIk1VyjTCnpN6eUSJIkqRG1TOCWJEmSGpGBW5IkSSpRywRulwWUJElSI2qZwO0cbkmSJDWilgnckiRJUiMycEuSJEklMnBLkiRJJWqZwO1Dk5IkSWpELRO4fWhSkiRJjahlArckSZLUiAzckiRJUokM3JIkSVKJIjPrXUNNREQb0Aa8D/hTHUrYFnimDtdtJd7D/vMe9p/3sP+8h/3nPew/72H/eQ837tWZOXpjnVomcNdbRCzMzEn1rqOZeQ/7z3vYf97D/vMe9p/3
sP+8h/3nPawdp5RIkiRJJTJwS5IkSSUycNfOzHoX0AK8h/3nPew/72H/eQ/7z3vYf97D/vMe1ohzuCVJkqQSOcItSZIklcjALUmSJJXIwF0DETE1Ih6MiIciYka962k2EbFjRMyPiPsj4r6I+ES9a2pWETEoIu6JiBvqXUszioitIuKHEfE/EfFARBxQ75qaTUR8svrP8R8i4qqIGF7vmhpdRMyKiKUR8YdObVtHxC8i4k/Vn6PqWWOj6+Eenl/9Z/n3EfGTiNiqnjU2uu7uYadjn4qIjIht61FbKzBw91NEDAIuAQ4DdgOOjYjd6ltV01kDfCozdwP2B/7Ve9hnnwAeqHcRTewbwM2ZuSvwOryXvRIRY4GPA5Mycw9gEHBMfatqCrOBqV3aZgC/ysydgV9V36tns1n/Hv4C2CMz9wL+CHxuoItqMrNZ/x4SETsChwKPDnRBrcTA3X/7AQ9l5sOZuQq4GjiqzjU1lcx8IjPvrv7+VyohZ2x9q2o+ETEOOBz4dr1raUYRMRJ4E/AdgMxclZnP1beqpjQYGBERg4HNgMfrXE/Dy8wFwLIuzUcB363+/l3gHQNaVJPp7h5m5s8zc0317R3AuAEvrIn08OcQ4OvAZwBX2egHA3f/jQUe6/R+MYbFPouI8cDewP9f30qa0oVU/qW4tt6FNKkJwNPAf1Sn5Xw7Ijavd1HNJDOXAF+lMhL2BLAiM39e36qa1vaZ+UT19yeB7etZTAv4EHBTvYtoNhFxFLAkM39X71qanYFbDSMitgB+BJyWmc/Xu55mEhFHAEsz865619LEBgP7AJdm5t7A3/B/4/dKdZ7xUVT+4+VVwOYRMa2+VTW/rKzf6+hiH0XEF6hMXbyy3rU0k4jYDPg88MV619IKDNz9twTYsdP7cdU29UJEDKEStq/MzB/Xu54m9EbgyIh4hMq0pkMi4or6ltR0FgOLM7P9/678kEoAV3FvARZl5tOZuRr4MfAvda6pWT0VEWMAqj+X1rmephQRHwSOAI5LNx7prf9F5T+ef1f9u2UccHdE7FDXqpqUgbv/7gR2jogJETGUygNC19W5pqYSEUFl3uwDmXlBvetpRpn5ucwcl5njqfwZnJeZjiz2QmY+CTwWEbtUm94M3F/HkprRo8D+EbFZ9Z/rN+ODp311HXBC9fcTgJ/VsZamFBFTqUyzOzIzX6x3Pc0mM+/NzO0yc3z175bFwD7Vf1eqlwzc/VR9IONUYC6Vv1iuycz76ltV03kjcDyVUdnfVl9vr3dR2iR9DLgyIn4PvB44p871NJXq/x34IXA3cC+Vv2PcGnojIuIq4HZgl4hYHBEnAecBb42IP1H5Pwfn1bPGRtfDPfx34JXAL6p/r1xW1yIbXA/3UDXi1u6SJElSiRzhliRJkkpk4JYkSZJKZOCWJEmSSmTgliRJkkpk4JYkSZJKZOCWpIIi4syIGPClnSJi84j4XkQsjYiMiAsHugZJUt8NrncBkqSN+lfgWOBDwB+BJ+pbjiSpNwzcktT4Xgs8npn/uaFOETEsM18aoJokSQU5pUSS+iEitoyIf4+IxyPipYh4MCI+Wd3avHO/fSLi1xGxMiIei4jPR8SXNzZFpXr8g8CO1ekkGREHV18ZEe+KiMsj4mngqU6fe11EXBcRy6vX/E1EHNTN+T8REY9ExN8jYmFEHFR9P7tTn26n0kTE7Ih4pEvbZhHxlYhYFBGrqj+/EBGv6NSnvfYjq/fumerriojYqsv5BkfEZyPi/mqNT0fEzRGxa0TsUL3GJ7qp7cyIeDEiRm3o/krSQHCEW5L6qBoi5wD7AF+ksp354cAFwGjg89V+2wK/Ah4HTgBWAZ8Exhe4zAHAmcDrgHdW2+6vXhPgYuAm4HhgePV6+wC/Bu4BTgZeBD4C/DIi/iUz76r2Owm4EJgN/ACYCFxFZTvsXouIwcBcYDfgbCr3Y3/gDGBr4FNdPvIN4Abg/cAuwP8HvEzlHrW7GnhHtc5fVr/jm4Ax
mfk/EfFTYHr1XO11DAJOAq7JzOV9+S6SVEsGbknqu7cDBwInZubsatvPI2Jz4FMRcUFmPgOcDmwGvC0zFwNExFzgkY1dIDPviIhngJcy84729k4D6P+dmR/u8rHzgUeBQzJzVafr/YFK+H1H9T8WzgTmZuaJnc77NJWQ2xfHUrkfkzNzQbXtV9VavxQRX8nMpZ36L8jMj1V//3lE7AJ8OCI+mJkZEYcARwOfyMyLOn3up51+/yYwPyIOysxfV9sOB8YBl/Xxe0hSTTmlRJL67k3AWuD7XdqvAIZSGZ2GyijvHe1hGyAzV1IZHe+vn3R+ExEjgMnAtcDa6pSMwUBQGSF+U7XruOrrmi7n+xGwpo+1TAX+AvxX+3Wr1/45MITKfeis6/e/FxgGbF99fyiQwOU9XTAzb6Ey4n9Kp+ZTgN93/g8USaonA7ck9d3WwLL2UeROnux0HGAMsJT1PdVNW291XbFka2AQlZHs1V1epwKjqqPbY7qrITPXAM/2sZbtgFd3c93/rh7fpkv/ZV3etz/wObxT/2XV/zjZkEuBd0fENhHxairB39FtSQ3DKSWS1HfLgK0jYmiX0L1Dp+NQCcXbdfP57btp662uDzM+R2XU/RKg21VNMnNtRLQH9XVqqI5Idw3Gf68e6/o9u/Z7FlgEvLeHWh/pob0nz1C5vyM2Err/EziXysOlo6jMWb+yl9eSpNI4wi1JfXcrlX+PvqdL+3FUHoy8vfr+DuCAiBjX3qE69ePwWheUmX+j8sDk64C7M3Nh11e162LgMdYPx0ez/mDMX6o/9+hU/1bAv3TpdzOwI/BCd9etzmfvjZ9TmQrTdY76OjLzeSoB+xQqa5VfVW2TpIbgCLck9d1NwG3AZRExGriPyoOUHwbO7RQwLwD+NzA3Ir5MZerE6dWfZexceTqwoHq971AZYd+WysomgzJzRnWU+8vAtyPiP6g8KDkRmAF0Das3ASuAyyPiS1TmWX8GeKFLvyuBE6k8KPk14HdU5rL/L+BI4B2Z+WLRL5GZ8yPiR8AFEbEjMI/KXPA3AXOq87fbfZN/zON2OomkhmLglqQ+qobWw4FzgM9SmWLxCJXAe2Gnfs9ExJuBi6hMf3iWSijcFvhACXXdHRH7Al+qXnMk8DRwN53CaGZ+JyK2qNZ7LJVVTI6l8tBn5/M9FxFHAF+n8pDlYuAs4C3AwZ36rY6It1EJ7dOBCcDfgD9TeUCy61z3Io6hcm9PAE6jEvzvBL7dpcbfR8Qfgecz8+4+XEeSShOZZQyuSJI2pLpW9N3AM5n55nrX01l1M5tbMvODdS6lsOqSgg8AJ2fmd+pdjyR15gi3JA2AiDgbeIjKfOhtqEw72YvKFBT1UXVe/ETgy1SmznRdolGS6s7ALUkDI6nsRvmq6u+/pzKn+aa6VtX8Pkzlvv4ReH+BJQQlacA5pUSSJEkqkcsCSpIkSSUycEuSJEklMnBLkiRJJTJwS5IkSSUycEuSJEkl+r8kWC8gEcBJ8QAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 864x576 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the log word frequencies; hist(log=True) additionally\n",
    "# puts the bin counts on a log scale.\n",
    "plt.figure(figsize=(12, 8))\n",
    "log_freq = np.log(freq)\n",
    "plt.hist(log_freq, log=True, bins=500)\n",
    "\n",
    "# Red vertical line at the log of element [0] of the entry at position\n",
    "# num_keep_words in words_sort_by_freq.\n",
    "# NOTE(review): element [0] is presumably that word's frequency -- confirm.\n",
    "cutoff_entry = words_sort_by_freq[num_keep_words]\n",
    "threshold = np.log(cutoff_entry[0])\n",
    "cutoff_label = \"{} words\".format(num_keep_words)\n",
    "plt.vlines(x=threshold, ymin=0, ymax=10**6, colors=\"r\", label=cutoff_label)\n",
    "\n",
    "plt.xlabel(\"log frequency\", fontsize=16)\n",
    "plt.ylabel(\"log number of words\", fontsize=16)\n",
    "plt.legend(fontsize=16)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Re-process the sentences with the new dictionary (fixed vocabulary size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_sentence(sentence, dictionary):\n",
    "    \"\"\"\n",
    "    Filter a tokenized sentence down to the words the dictionary knows\n",
    "        sentence: [word_0, word_1, ...]\n",
    "        dictionary: {word: id}\n",
    "    Returns:\n",
    "        kept_words: the words of `sentence`, in order, that have an entry\n",
    "                    in `dictionary` (a word mapped to None counts as missing)\n",
    "        kept_ids: the dictionary value of each kept word\n",
    "    \"\"\"\n",
    "    kept_words, kept_ids = [], []\n",
    "    for token in sentence:\n",
    "        token_id = dictionary.get(token)\n",
    "        # out-of-vocabulary word: drop it from both outputs\n",
    "        if token_id is None:\n",
    "            continue\n",
    "        kept_words.append(token)\n",
    "        kept_ids.append(token_id)\n",
    "\n",
    "    return kept_words, kept_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build new dictionary: {word: rank}, where rank is the word's position\n",
    "# among the first num_keep_words entries of words_sort_by_freq\n",
    "dictionary = {\n",
    "    freq_word[1]: index\n",
    "    for index, freq_word in enumerate(words_sort_by_freq[:num_keep_words])\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed total words: 77829522; New vocabulary size: 50000\n"
     ]
    }
   ],
   "source": [
    "# re-filter every message/response pair in place with the new dictionary\n",
    "for pair in data:\n",
    "    pair[\"message\"], pair[\"m_ids\"] = process_sentence(\n",
    "        pair[\"message\"], dictionary)\n",
    "    pair[\"response\"], pair[\"r_ids\"] = process_sentence(\n",
    "        pair[\"response\"], dictionary)\n",
    "\n",
    "# words that survived the filtering, counted over both sides of each pair\n",
    "p_total_words = sum(len(pair[\"m_ids\"]) + len(pair[\"r_ids\"]) for pair in data)\n",
    "p_dict_size = len(dictionary)\n",
    "\n",
    "summary_template = \"Processed total words: {}; New vocabulary size: {}\"\n",
    "print(summary_template.format(p_total_words, p_dict_size))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Export the processed data as text files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_output_form(token_list):\n",
    "    \"\"\"\n",
    "    Transform data into a string output format\n",
    "        token_list: iterable of tokens (words or ids)\n",
    "    Returns:\n",
    "        the tokens converted with str() and joined by single spaces;\n",
    "        an empty input gives an empty string\n",
    "    \"\"\"\n",
    "    # str() each token directly instead of round-tripping through a\n",
    "    # fixed-width numpy string array: no per-call array allocation,\n",
    "    # tokens are never altered, and any iterable is accepted\n",
    "    return \" \".join(map(str, token_list))"
    "\n",
    "\n",
    "def export_data(data):\n",
    "    \"\"\"\n",
    "    Write the processed data set to four text files in the working directory\n",
    "        data: list of {message:[...], m_ids:[...], response:[...], r_ids:[...]}\n",
    "    Every cell is first turned into a string by to_output_form; each column\n",
    "    then goes to its own file without header or index column.\n",
    "    \"\"\"\n",
    "    text_frame = pd.DataFrame(data).applymap(to_output_form)\n",
    "\n",
    "    # (column, output file) pairs, written in this order\n",
    "    targets = (\n",
    "        (\"message\", \"twitter_message.txt\"),\n",
    "        (\"response\", \"twitter_response.txt\"),\n",
    "        (\"m_ids\", \"twitter_message_ids.txt\"),\n",
    "        (\"r_ids\", \"twitter_response_ids.txt\"),\n",
    "    )\n",
    "    # NOTE(review): to_csv presumably applies its default CSV quoting, so\n",
    "    # lines containing a comma or a double quote would be written quoted --\n",
    "    # confirm that downstream readers expect this.\n",
    "    for column, path in targets:\n",
    "        text_frame[column].to_csv(path, header=None, index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the processed messages, responses and their ids to the output text files\n",
    "export_data(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
